import matplotlib.pyplot as plt
import numpy as np  # BUG FIX: was "import numpy as npF"; the `np` alias is used throughout the notebook
import pandas as pd
import pandas_profiling
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ClassificationReport, ROCAUC
# Adjust pandas display and formatting settings
# Remove scientific notation and display numbers with 5 decimal places instead
pd.options.display.float_format = '{:,.5f}'.format
# Widen the notebook cells to use most of the browser window
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
# Update default style and size of charts
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [20, 10]
# Increase max number of rows and columns to display in pandas tables
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# Load the bank marketing dataset
df= pd.read_csv('bank-full.csv')
df.head()
#There are 45211 rows
df.shape
#Most of the fields are object (i.e. categorical) and the rest are integer fields;
#there are no integer columns where garbage data has forced the dtype to object.
#Looks fine at first glance
df.info()
#There are no missing values
df.isnull().sum()
df.describe()
profile = df.profile_report(html={'style':{'full_width':True}}) #syntax for v2.6 of pandas_profiling that works with pandas v1.0+
profile
df.describe()
# Human-readable description for each column, used by findOutliers() summaries
fieldDescription = {
"age": "Age of the Customer",
"balance": "Balance in Customers account",
"day": "Last contact Day of the Month",
"duration" :"Last contact duration, in seconds",
"campaign":"Number of contacts performed during this campaign and for this client",
"pdays":"Number of days that passed by after the client was last contacted from a previous campaign",
"previous":"Number of contacts performed before this campaign and for this client",
"job":"Type of job (management, technician, entrepreneur, blue-collar, etc.)",
"marital":"marital status (married, single, divorced)",
"education": "education level (primary, secondary, tertiary)",
"default": "has credit in default?",
"housing": "has housing loan?",
"loan": "has personal loan?",
"month": "last contact month of year",
"poutcome": "outcome of the previous marketing campaign",
"Target": "Tell us has the client subscribed a term deposit. (Yes, No)",
"contact": "contact communication type"
}
def findOutliers(column, data=None, descriptions=None):
    """Summarise one numeric column: descriptive stats plus IQR outlier counts.

    Parameters
    ----------
    column : str
        Name of the numeric column to profile.
    data : pandas.DataFrame, optional
        Frame to profile. Defaults to the module-level ``df`` so existing
        ``findOutliers('age')`` calls keep working unchanged.
    descriptions : dict, optional
        Mapping of column name -> human-readable description. Defaults to
        the module-level ``fieldDescription``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with min/max/mean/median/std, quartiles, the
        1.5*IQR fences, the number of points outside them, and a crude
        skew label.
    """
    if data is None:
        data = df
    if descriptions is None:
        descriptions = fieldDescription
    s = data[column]
    Q1 = s.quantile(0.25)
    Q3 = s.quantile(0.75)
    median = s.quantile(0.50)
    mean = s.mean()
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # Crude skew heuristic: mean dragged above the median => right tail,
    # below the median => left tail.
    if mean > median:
        skewed = 'RightSkewed'
    elif mean < median:
        skewed = 'LeftSkewed'
    else:
        skewed = 'NotSkewed'
    # BUG FIX: the original built this row with DataFrame.append, which was
    # deprecated in pandas 1.4 and removed in pandas 2.0; construct the
    # one-row frame directly instead.
    column_order = ['Field', 'Description', 'Total No of Data', 'Min', 'Max',
                    'Mean', 'Median(Q2)', 'Std Dev', 'Q1', 'Q3',
                    'Total No of Missing', 'Total Number of Outliers',
                    'Skewed', 'Lowerbound', 'UpperBound',
                    'No of Outliers under lowerbound',
                    'No of Outliers above Upperbound']
    row = {'Field': column,
           'Description': descriptions[column],
           'Total No of Data': s.count(),
           'Min': s.min(),
           'Max': s.max(),
           'Mean': mean,
           'Median(Q2)': median,
           'Std Dev': s.std(),
           'Q1': Q1,
           'Q3': Q3,
           'Total No of Missing': s.isnull().sum(),
           'Total Number of Outliers': ((s < lower) | (s > upper)).sum(),
           'Skewed': skewed,
           'Lowerbound': lower,
           'UpperBound': upper,
           'No of Outliers under lowerbound': (s < lower).sum(),
           'No of Outliers above Upperbound': (s > upper).sum()}
    return pd.DataFrame([row], columns=column_order)
# Build a one-row summary per numeric field and stack them for comparison
age_data = findOutliers('age')
balance_data = findOutliers('balance')
day_data = findOutliers('day')
duration_data = findOutliers('duration')
campaign_data = findOutliers('campaign')
pdays_data = findOutliers('pdays')
previous_data = findOutliers('previous')
pd.concat([age_data, balance_data, day_data, duration_data, campaign_data, pdays_data, previous_data])
df.describe()
df['Target'].value_counts()
# Frequency tables for every categorical (object-dtype) column
for col in df.columns:
    if df[col].dtype == "object":
        print("FieldName :", col)
        print("FieldDescription :", fieldDescription[col])
        print(df[col].value_counts())
        print("\n")
# Clients with a negative balance who still subscribed a term deposit.
# BUG FIX: '&' binds tighter than '==', so the original
# `(df['balance']<0) & df['Target']==1` parsed as
# `((df['balance']<0) & df['Target']) == 1`; also 'Target' still holds
# 'yes'/'no' at this point (it is mapped to 1/0 further down the notebook).
len(df[(df['balance'] < 0) & (df['Target'] == 'yes')])
def plotHistBox(column):
    """Draw a boxplot above a histogram (shared x-axis) for df[column]."""
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True,
                                        gridspec_kw={"height_ratios": (.15, .85)})
    # Boxplot on top, distribution below
    sns.boxplot(x=df[column], ax=ax_box)
    # BUG FIX: sns.distplot was deprecated in seaborn 0.11 and removed in
    # 0.14; histplot(..., kde=True) is the supported equivalent.
    sns.histplot(df[column], kde=True, ax=ax_hist)
    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')
plotHistBox('age')
# BUG FIX: this call was misspelled "lotHistBox" and raised a NameError
plotHistBox('balance')
plotHistBox('campaign')
plotHistBox('day')
plotHistBox('duration')
plotHistBox('pdays')
plotHistBox('previous')
df.head()
# Most of the people in the dataset are blue-collar and management
sns.countplot(x=df["job"])
# Most of the people in the dataset are married
sns.countplot(x=df["marital"])
# Most of the people in the dataset have secondary education
sns.countplot(x=df['education'])
# Distribution of credit-in-default flag
sns.countplot(x=df['default'])
# Most of the people in the dataset have a housing loan
sns.countplot(x=df['housing'])
# Most of the people in the dataset have no personal loan
sns.countplot(x=df['loan'])
# Most of the people were contacted over the phone
sns.countplot(x=df['contact'])
# Most of the people in the dataset were contacted in the month of May
sns.countplot(x=df['month'])
# Most of the previous campaign outcomes are unknown
sns.countplot(x=df['poutcome'])
1) There are no missing values in the data. 2) Will apply a feature-scaling technique (StandardScaler) to bring all fields to the same scale, since the balance field has much larger values than the other fields. 3) Will keep outliers as-is, since mainly LogisticRegression is sensitive to outliers; will run ensemble techniques such as RandomForest and XGBoost to check the model with outliers. 4) Will replace "yes" with 1 and "no" with 0 in all the columns to make them numeric fields instead of Object. 5) Will convert categorical data to numeric using dummies before fitting the data to the model.
# Replace all the "yes" values with 1 and "no" values with 0 (whole frame)
df.replace({"yes": 1, "no": 0},inplace=True)
df.corr()
# No strong correlation visible between any of the fields
sns.heatmap(df.corr(),annot=True,cmap="RdYlGn")
# In the pairplot, no single independent variable separates the target:
# classes 0 and 1 overlap in all the individual variables
sns.pairplot(df,hue="Target")
# Management, technician and blue-collar are the jobs with the most term-deposit subscribers
sns.countplot(x=df['job'],hue=df["Target"])
# Married and single clients are more numerous among term-deposit subscribers
sns.countplot(x=df['marital'],hue=df["Target"])
# People with secondary and tertiary education are the most frequent subscribers
sns.countplot(x=df['education'],hue=df["Target"])
# People with no credit in default are the ones who subscribed; target people with no default
sns.countplot(x=df['default'],hue=df["Target"])
# More clients without a housing loan acquired a term deposit
sns.countplot(x=df['housing'],hue=df["Target"])
# Most clients without a personal loan are the ones who subscribed
sns.countplot(x=df['loan'],hue=df["Target"])
# Most of the people contacted by phone are the ones who subscribed
sns.countplot(x=df['contact'],hue=df["Target"])
# Most contacts happened in May, but no significant month pattern
sns.countplot(x=df['month'],hue=df["Target"])
# Most previous outcomes are "unknown" among subscribers
sns.countplot(x=df['poutcome'],hue=df["Target"])
# Checking the relation between age and term deposit: bin ages into 5-year buckets
agebins = np.arange(0, 100, 5)
agelabels =np.arange(5, 100, 5)
df['binnedAge'] = pd.cut(df['age'], agebins,labels=agelabels)
# Most subscribers are in the 25-70 age range, peaking at 35-40
sns.countplot(x=df['binnedAge'],hue=df["Target"])
# Most clients contacted 1, 2 or 3 times during the campaign acquired a term deposit
sns.countplot(x=df['campaign'],hue=df["Target"])
# Can't derive much from the below graph
g = sns.catplot(x="job", y="balance", hue="Target", col="education", data=df)
var = g.set_xticklabels(labels=df['job'], rotation=60)
# Can't derive much from the below graph
g = sns.catplot(x="job", y="balance", col="Target", data=df)
var = g.set_xticklabels(labels=df['job'], rotation=60)
# Can't derive much from the below graph
g = sns.catplot(x="education", y="balance", col="Target", data=df)
# NOTE(review): labels come from df['job'] even though the x-axis is
# education — kept as-is, but this looks like a copy-paste slip worth checking.
var = g.set_xticklabels(labels=df['job'], rotation=60)
# Most clients were contacted more than once through cellular and telephone
g = sns.catplot(x="contact", y="campaign", hue="Target", data=df)
# Can't make much of the below graph
g = sns.catplot(x="contact", y="duration", hue="Target", col="education", data=df)
# All the dtypes look fine
df.info()
# BUG FIX: the engineered column created above is 'binnedAge' (there is no
# 'binnedBalance' column anywhere in the notebook), so the original
# drop(columns=['binnedBalance']) raised a KeyError.
df.drop(columns=['binnedAge'], inplace=True)
# One-hot encode the remaining categorical columns (drop_first avoids the dummy trap)
df = pd.get_dummies(df, columns = ['job','marital','education','contact','month','poutcome'], drop_first = True)
df.head()
# Splitting the dataset into the Training set and Test set (70/30, fixed seed)
X=df.drop('Target',axis=1)
y=df['Target']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state =0)
X_train.shape,X_test.shape
y_train.shape,y_test.shape
# Standardize features: fit on the training split only, then apply the
# same learned transform to the test split (avoids leakage)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
# Creating a function for evaluating model performance
def evaluate_model_performance(model, X_train, X_test, y_train, y_test):
    """Score a fitted classifier on both the train and test splits.

    Returns a 10-tuple:
    (accuracy_train, accuracy_test, f1_train, f1_test,
     auc_train, auc_test, recall_train, recall_test,
     precision_train, precision_test)

    AUC uses the positive-class probability from predict_proba; the other
    metrics use hard class predictions. Predictions are computed once per
    split instead of once per metric (the original called model.predict
    eight times).
    """
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    auc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    auc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return (accuracy_score(y_train, pred_train),
            accuracy_score(y_test, pred_test),
            f1_score(y_train, pred_train),
            f1_score(y_test, pred_test),
            auc_train,
            auc_test,
            recall_score(y_train, pred_train),
            recall_score(y_test, pred_test),
            precision_score(y_train, pred_train),
            precision_score(y_test, pred_test))
## Define the model (liblinear supports both l1 and l2 penalties)
lg = LogisticRegression(solver = 'liblinear',random_state=22)
## Train the model
lg.fit(X_train, y_train)
# Produce a confusion matrix with actual and predicted outcomes
pd.crosstab(y_test, lg.predict(X_test), rownames = ['Actual'], colnames = ['Predictions'])
# Produce test set performance metrics
scores = evaluate_model_performance(lg, X_train, X_test,y_train,y_test)
# Store results in a dataframe for comparison; every later model appends a row
resultsmatrix = pd.DataFrame({'Model': ['Logistic Regression'],
'Accuracy Train':scores[0],
'Accuracy Test':scores[1],
'F1 Train':scores[2],
'F1 Test':scores[3],
'AUC Train': scores[4],
'AUC Test': scores[5],
'Recall Train': scores[6],
'Recall Test': scores[7],
'Precision Train': scores[8],
'Precision Test': scores[9]})
resultsmatrix
# Fine-tune the logistic regression with an exhaustive grid search.
# (The original comment mentioned SVC, but the estimator tuned here is `lg`.)
# Couldn't run more parameters because of compute limitations.
from sklearn.model_selection import GridSearchCV
# Candidate regularization penalties
penalty = ['l1', 'l2']
# 10 log-spaced C values between 10**0 and 10**4.
# BUG FIX: np.logspace(0, 4, 10, 100) passed 100 into the `endpoint`
# parameter; any truthy value behaves as endpoint=True, so dropping it
# leaves the generated values unchanged while matching the intended call.
C = np.logspace(0, 4, 10)
# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)
# Create grid search using 5-fold cross validation
grid_search = GridSearchCV(lg, hyperparameters, cv=5, verbose=0)
grid_search = grid_search.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",grid_search.best_params_)
print("accuracy :",grid_search.best_score_)
# Refit logistic regression with the best parameters reported by the grid search
lg_grid = LogisticRegression(solver='liblinear', C=2.7825594022071245, penalty='l2', random_state=22)
lg_grid.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the held-out test split
pd.crosstab(y_test, lg_grid.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(lg_grid, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Logistic Regression - With Grid Search ", *scores]
resultsmatrix
from sklearn.model_selection import RandomizedSearchCV
# Candidate regularization penalties
penalty = ['l1', 'l2']
# BUG FIX: the original np.logspace(0, 0.5, 4, 10, 100) bound 10 to the
# `endpoint` parameter and 100 to `base`, which is almost certainly not
# what was intended; use the same 10-point 10**0..10**4 grid as the grid
# search above.
C = np.logspace(0, 4, 10)
# Create hyperparameter options
param_distributions = dict(C=C, penalty=penalty)
# Randomized search: 5 parameter draws, 5-fold CV, ROC-AUC scoring
randomsearch = RandomizedSearchCV(lg, param_distributions, n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
randomsearch = randomsearch.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",randomsearch.best_params_)
print("accuracy :",randomsearch.best_score_)
# Refit logistic regression with the parameters found by the randomized search
lg_randomSearch = LogisticRegression(solver='liblinear', C=1.0, penalty='l1', random_state=22)
lg_randomSearch.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the held-out test split
pd.crosstab(y_test, lg_randomSearch.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(lg_randomSearch, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Logistic Regression - With Random Search ", *scores]
resultsmatrix
# Baseline decision tree (gini impurity, fixed seed)
dTree = DecisionTreeClassifier(criterion='gini', random_state=22)
dTree.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, dTree.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(dTree, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Decision Tree ", *scores]
resultsmatrix
# Hyper-parameter tuning for the decision tree via exhaustive grid search
# NOTE(review): max_features='auto' was removed for trees in scikit-learn 1.3 — confirm version.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 100],
    'max_features': [None, "auto", "sqrt", "log2"],
    'min_samples_leaf': [1, 10, 20, 30],
}
clf = GridSearchCV(dTree, tree_para, cv=5)
clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", clf.best_params_)
print("accuracy :", clf.best_score_)
# Decision tree refit with the grid-search winners (gini, depth 6, min leaf 30)
dTree_gridSearch = DecisionTreeClassifier(criterion='gini', max_depth=6, min_samples_leaf=30, random_state=22)
dTree_gridSearch.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, dTree_gridSearch.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(dTree_gridSearch, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Decision Tree with Grid Search ", *scores]
resultsmatrix
# Hyper-parameter tuning for the decision tree via randomized search
# (5 draws, 5-fold CV, ROC-AUC scoring)
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50, 100],
    'max_features': [None, "auto", "sqrt", "log2"],
    'min_samples_leaf': [1, 10, 20, 30],
}
dtree_random = RandomizedSearchCV(dTree, param_distributions=tree_para, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3)
dtree_random.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", dtree_random.best_params_)
print("accuracy :", dtree_random.best_score_)
# Decision tree refit with the randomized-search winners (gini, depth 8, min leaf 20)
dTree_randomizedSearchCV = DecisionTreeClassifier(criterion='gini', max_depth=8, min_samples_leaf=20, random_state=22)
dTree_randomizedSearchCV.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, dTree_randomizedSearchCV.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(dTree_randomizedSearchCV, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Decision Tree with Randomized Search ", *scores]
resultsmatrix
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest with default hyper-parameters
rfcl = RandomForestClassifier(random_state=22)
rfcl = rfcl.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, rfcl.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(rfcl, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Random Forest ", *scores]
resultsmatrix
# Hyper-parameter tuning for the random forest via exhaustive grid search
# NOTE(review): max_features='auto' was removed in scikit-learn 1.3 — confirm version.
from scipy.stats import randint
randomtree_para = {
    'max_depth': [3, 5, 10, 50, 100, None],
    'n_estimators': [10, 50, 100, 200],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_leaf': [5, 10, 20, 30],
    # 'max_samples': [100, 200, 300]
}
clf = GridSearchCV(rfcl, randomtree_para, cv=5)
clf.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", clf.best_params_)
print("accuracy :", clf.best_score_)
# Random forest refit with the grid-search winners
rfcl_gridsearch = RandomForestClassifier(criterion='gini', bootstrap=False, max_features='auto', n_estimators=50)
rfcl_gridsearch = rfcl_gridsearch.fit(X_train, y_train)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, rfcl_gridsearch.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(rfcl_gridsearch, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Random Forest with Grid Search ", *scores]
resultsmatrix
# Randomized search over a wider random-forest space (5 draws, 10-fold CV)
est = RandomForestClassifier(n_jobs=-1)
rf_p_dist = {
    'max_depth': [3, 5, 10, 50, 100, None],
    'n_estimators': [10, 50, 100, 200, 300, 400, 500],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_leaf': [5, 10, 20, 30],
    # 'max_samples': [100, 200, 300]
}
rdmsearch = RandomizedSearchCV(est, param_distributions=rf_p_dist, n_jobs=-1, n_iter=5, cv=10)
rdmsearch.fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ", rdmsearch.best_params_)
print("accuracy :", rdmsearch.best_score_)
# Random forest refit with the randomized-search winners
rfcl_randomsearch = RandomForestClassifier(criterion = 'gini',bootstrap=False,max_features='sqrt',n_estimators=50,min_samples_leaf=5,max_depth=50)
# BUG FIX: the original line re-fitted `rfcl_gridsearch` and bound the
# result here, so the randomized-search parameters above were never used.
rfcl_randomsearch = rfcl_randomsearch.fit(X_train, y_train)
# Produce a confusion matrix with actual and predicted outcomes
pd.crosstab(y_test, rfcl_randomsearch.predict(X_test), rownames = ['Actual'], colnames = ['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(rfcl_randomsearch, X_train, X_test,y_train,y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Random Forest with Random Search ", *scores]
resultsmatrix
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble with the default base estimator (decision tree)
bgcl = BaggingClassifier()
bgcl = bgcl.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(bgcl, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Bagging Classifier", *scores]
resultsmatrix
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, bgcl.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default hyper-parameters
abcl = AdaBoostClassifier()
abcl = abcl.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(abcl, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["AdaBoostClassifier", *scores]
resultsmatrix
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, abcl.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters
gbcl = GradientBoostingClassifier()
gbcl = gbcl.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(gbcl, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["GradientBoostingClassifier", *scores]
resultsmatrix
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, gbcl.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
from xgboost import XGBClassifier
# XGBoost with default hyper-parameters
XGBClass = XGBClassifier()
XGBClass = XGBClass.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(XGBClass, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["XGBClassifier", *scores]
resultsmatrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# Create a pipeline whose single step ("classifier") is swapped out by the search
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
{"classifier": [LogisticRegression()],
"classifier__penalty": ['l2','l1'],
"classifier__C": np.logspace(0, 4, 10)
},
{"classifier": [LogisticRegression()],
"classifier__penalty": ['l2'],
"classifier__C": np.logspace(0, 4, 10),
"classifier__solver":['newton-cg','saga','sag','liblinear'] ## NOTE(review): the original said these solvers don't allow L1, but saga/liblinear do support it — verify
},
{"classifier": [RandomForestClassifier()],
"classifier__n_estimators": [10, 50, 100],
"classifier__max_depth":[5,8,15,25,30,None],
"classifier__min_samples_leaf":[1,2,5,10,15,100],
"classifier__max_leaf_nodes": [2, 5,10]},
{"classifier": [AdaBoostClassifier()],
"classifier__n_estimators": [10,20, 50, 100],
"classifier__learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30,1.0 ]
},
{"classifier": [GradientBoostingClassifier()],
"classifier__n_estimators": [10, 50, 100],
"classifier__learning_rate" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ]
},
{
"classifier": [XGBClassifier()] ,
"classifier__n_estimators": [10, 50, 100,200,300,400,500],
"classifier__learning_rate" : [0.01,0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
"classifier__max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15,20,50,60,70],
"classifier__min_child_weight" : [ 1, 3, 5, 7 ],
"classifier__gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
"classifier__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
},
{
"classifier": [BaggingClassifier()] ,
"classifier__n_estimators": [10, 50, 100],
"classifier__max_samples": [0.7,0.8]
}
]
# Create a randomized search over the pipeline, then fit the best model
# (the exhaustive grid-search variant below was too expensive to run)
#gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
#best_model = gridsearch.fit(X_train,y_train)
random_search=RandomizedSearchCV(pipe,param_distributions=grid_param,n_iter=5,scoring='recall',n_jobs=-1,cv=5,verbose=3)
best_model=random_search.fit(X_train,y_train)
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))
print("The best model params are:",random_search.best_params_)
# XGBoost refit with the best parameters reported by the randomized pipeline search
XGBclass_tuned = XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                               colsample_bynode=None, colsample_bytree=0.4, gamma=0.1,
                               gpu_id=None, importance_type='gain', interaction_constraints=None,
                               learning_rate=0.25, max_delta_step=None, max_depth=6,
                               min_child_weight=7, missing=np.nan, monotone_constraints=None,
                               n_estimators=300, n_jobs=None, num_parallel_tree=None,
                               objective='binary:logistic', random_state=None, reg_alpha=None,
                               reg_lambda=None, scale_pos_weight=None, subsample=None,
                               tree_method=None, validate_parameters=None, verbosity=None)
XGBclass_tuned = XGBclass_tuned.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(XGBclass_tuned, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["XGBClassifier Random Search", *scores]
resultsmatrix
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, XGBclass_tuned.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# Baseline KNN: report 10-fold CV accuracy, then fit on the training split
knnclassifier = KNeighborsClassifier()
print(cross_val_score(knnclassifier, X_train, y_train, cv=10, scoring='accuracy').mean())
knnclassifier.fit(X_train, y_train)
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(knnclassifier, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["knnclassifier ", *scores]
resultsmatrix
# 10-fold CV accuracy of a default decision tree and random forest, for reference
from sklearn.tree import DecisionTreeClassifier
decisionTree = DecisionTreeClassifier(random_state=0)
print(cross_val_score(decisionTree, X_train, y_train, cv=10, scoring='accuracy').mean())
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
print(cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy').mean())
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion of the scaled features
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
# Transform the test split with the SAME fitted expansion (no re-fit)
X_test_poly = poly.transform(X_test)
logreg = LogisticRegression()
print(cross_val_score(logreg, X_train_poly, y_train, cv=10, scoring='accuracy').mean())
# Keep a plain-feature fit around (later cells reuse `logreg` on X_train)
logreg.fit(X_train, y_train)
# BUG FIX: the original fitted and scored on the raw features, so the
# "Polynomial Logistic Regression" row actually measured a plain logistic
# regression. Fit a separate model on the polynomial features and score it
# on the matching polynomial test set.
logreg_poly = LogisticRegression().fit(X_train_poly, y_train)
scores = evaluate_model_performance(logreg_poly, X_train_poly, X_test_poly, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Polynomial Logistic Regression ", *scores]
resultsmatrix
# cross_val_score on a support vector classifier
from sklearn.svm import SVC
# probability=True is required because evaluate_model_performance calls
# predict_proba for the AUC columns; a plain SVC() has no predict_proba.
svcClassifier = SVC(probability=True)
print(cross_val_score(svcClassifier, X_train, y_train, cv=10, scoring='accuracy').mean())
svcClassifier.fit(X_train, y_train)
# BUG FIX: the original scored `logreg` here, so the "Scalar Vector
# Classifier" row silently repeated the logistic-regression metrics.
scores = evaluate_model_performance(svcClassifier, X_train, X_test, y_train, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Scalar Vector Classifier ", *scores]
resultsmatrix
The focus here should be on recall, because the target variable is acquiring a 'Term Deposit', i.e. whether the customer accepts the term deposit. The bank wants more people to accept term deposits, i.e. fewer false negatives, so that it does not lose real customers who want a term deposit. Hence the focus should be on increasing recall.
From the above, it can be seen that XGBClassifier is a good model, followed by the Decision Tree model, as recall on the test set is better for these models than for the others.
Since the recall percentage is still not good, will try oversampling the minority class using SMOTE (Synthetic Minority Oversampling Technique).
!pip install -U imbalanced-learn
#WIll have to check if we rectify the Class Imbalance will the Recall improve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from pylab import rcParams
unique, count = np.unique(y_train, return_counts=True)
Y_train_dict_value_count = { k:v for (k,v) in zip(unique, count)}
Y_train_dict_value_count
sm = SMOTE(random_state=12)
x_train_res, y_train_res = sm.fit_sample(X_train, y_train)
unique, count = np.unique(y_train_res, return_counts=True)
y_train_smote_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_smote_value_count
# XGBoost trained on the SMOTE-balanced data; the test split stays untouched
xgbClass_smothe = XGBClassifier().fit(x_train_res, y_train_res)
# Confusion matrix: actual vs predicted on the (imbalanced) test split
pd.crosstab(y_test, xgbClass_smothe.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(xgbClass_smothe, x_train_res, X_test, y_train_res, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["XGBClassifier - Smothe ", *scores]
resultsmatrix
# compare ensemble to each baseline classifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot
# get the dataset
#def get_dataset():
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=1)
# return X, y
# get a stacking ensemble of models
def get_stacking():
    """Stacking ensemble: five diverse base learners feeding an XGBoost meta-learner."""
    base_learners = [
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('cart', DecisionTreeClassifier()),
        ('svm', SVC()),
        ('bayes', GaussianNB()),
    ]
    return StackingClassifier(estimators=base_learners,
                              final_estimator=XGBClassifier(),
                              cv=5)
# get a list of models to evaluate
def get_models():
    """Return a name -> unfitted estimator mapping of every model to benchmark."""
    return {
        'lr': LogisticRegression(),
        'knn': KNeighborsClassifier(),
        'cart': DecisionTreeClassifier(),
        'svm': SVC(),
        'bayes': GaussianNB(),
        'stacking': get_stacking(),
    }
# evaluate a given model using cross-validation
def evaluate_model(model):
    # 10-fold CV repeated 3 times, stratified to preserve the class balance
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # NOTE: scores against the module-level X, y (the full, unscaled feature
    # matrix), not the scaled train/test split used elsewhere
    scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
    #scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
    return scores
# define dataset
#X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store recall scores per model
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
# compare ensemble to each baseline classifier
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from matplotlib import pyplot
# get the dataset
#def get_dataset():
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=1)
# return X, y
# get a stacking ensemble of models
def get_stacking():
    # NOTE: re-definition of the earlier get_stacking, kept identical;
    # repeated here because the notebook cell was duplicated for the SMOTE run.
    # define the base models
    level0 = list()
    level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = XGBClassifier()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model
# get a list of models to evaluate
def get_models():
    # NOTE: re-definition of the earlier get_models, kept identical
    models = dict()
    models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    models['stacking'] = get_stacking()
    return models
# evaluate a given model using cross-validation
def evaluate_model(model):
    # 10-fold CV repeated 3 times, stratified to preserve the class balance
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # NOTE: unlike the first comparison run, this re-definition scores
    # against the SMOTE-resampled training data (x_train_res, y_train_res)
    scores = cross_val_score(model, x_train_res, y_train_res, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
    #scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
    return scores
# define dataset
#X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store recall scores per model
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
From the above we can see that the recall rate improved drastically on the SMOTE training dataset for most of the models. Will choose KNN and Stacking to check the predictions.
Recall rate on the actual dataset:
lr 0.208 (0.025), knn 0.275 (0.018), cart 0.482 (0.017), svm 0.009 (0.004), bayes 0.506 (0.016), stacking 0.267 (0.029)
Recall rate on the SMOTE dataset:
lr 0.848 (0.007), knn 0.990 (0.002), cart 0.921 (0.006), svm 0.952 (0.004), bayes 0.530 (0.009), stacking 0.969 (0.003)
# KNN trained on the SMOTE-balanced data; the test split stays imbalanced
knn = KNeighborsClassifier().fit(x_train_res, y_train_res)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, knn.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(knn, x_train_res, X_test, y_train_res, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["KNN - Smote dataset ", *scores]
resultsmatrix
# Stacking ensemble trained on the SMOTE-balanced data
stacking = get_stacking().fit(x_train_res, y_train_res)
# Confusion matrix: actual vs predicted on the test split
pd.crosstab(y_test, stacking.predict(X_test), rownames=['Actual'], colnames=['Predictions'])
# Append this model's train/test metrics to the comparison table
scores = evaluate_model_performance(stacking, x_train_res, X_test, y_train_res, y_test)
resultsmatrix.loc[len(resultsmatrix)] = ["Stacking - Smote dataset ", *scores]
resultsmatrix
The focus here should be on recall, because the target variable is acquiring a 'Term Deposit', i.e. whether the customer accepts the term deposit. The bank wants more people to accept term deposits, i.e. fewer false negatives, so that it does not lose real customers who want a term deposit. Hence the focus should be on increasing recall.
From the above, it can be seen that "KNN - Smote dataset" is a good model, followed by "XGBClassifier - Smothe", as recall on the test set is better for these models than for the others.